From f481a3f9c3d69d1a7a059538547550142ac66792 Mon Sep 17 00:00:00 2001
From: Keir Fraser <keir.fraser@citrix.com>
Date: Sat, 14 Nov 2009 08:09:50 +0000
Subject: [PATCH] xend: Balloon down memory to achieve enough DMA32 memory for
 PV guests with PCI pass-through to successfully launch.

If the user hasn't used the dom0_mem= bootup parameter, the privileged
domain usurps all of the memory. During launch of PV guests with PCI
pass-through we ratchet down the memory for the privileged domain to
the required memory for the PV guest. However, for PV guests with PCI
pass-through we do not take into account that the PV guest is going to
swap its SWIOTLB memory for DMA32 memory - in fact, swap 64MB of
it. This patch balloons down the privileged domain so that there are
64MB of DMA32 memory available.

From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
---
 tools/python/xen/lowlevel/xc/xc.c       | 18 ++++++++
 tools/python/xen/xend/XendConfig.py     |  7 ++++
 tools/python/xen/xend/XendDomainInfo.py | 56 ++++++++++++++++++++++++-
 tools/python/xen/xend/XendNode.py       | 12 ++++--
 4 files changed, 87 insertions(+), 6 deletions(-)

diff --git a/tools/python/xen/lowlevel/xc/xc.c b/tools/python/xen/lowlevel/xc/xc.c
index 12ea007a53..4c90579c68 100644
--- a/tools/python/xen/lowlevel/xc/xc.c
+++ b/tools/python/xen/lowlevel/xc/xc.c
@@ -1059,6 +1059,7 @@ static PyObject *pyxc_physinfo(XcObject *self)
     int i, j, max_cpu_id;
     uint64_t free_heap;
     PyObject *ret_obj, *node_to_cpu_obj, *node_to_memory_obj;
+    PyObject *node_to_dma32_mem_obj;
     xc_cpu_to_node_t map[MAX_CPU_ID + 1];
     const char *virtcap_names[] = { "hvm", "hvm_directio" };
 
@@ -1128,10 +1129,27 @@ static PyObject *pyxc_physinfo(XcObject *self)
         Py_DECREF(pyint);
     }
 
+    xc_dom_loginit();
+    /* DMA memory. */
+    node_to_dma32_mem_obj = PyList_New(0);
+
+    for ( i = 0; i < info.nr_nodes; i++ )
+    {
+        PyObject *pyint;
+
+        xc_availheap(self->xc_handle, 0, 32, i, &free_heap);
+        xc_dom_printf("Node:%d: DMA32:%ld\n", i, free_heap);
+        pyint = PyInt_FromLong(free_heap / 1024);
+        PyList_Append(node_to_dma32_mem_obj, pyint);
+        Py_DECREF(pyint);
+    }
+
     PyDict_SetItemString(ret_obj, "node_to_cpu", node_to_cpu_obj);
     Py_DECREF(node_to_cpu_obj);
     PyDict_SetItemString(ret_obj, "node_to_memory", node_to_memory_obj);
     Py_DECREF(node_to_memory_obj);
+    PyDict_SetItemString(ret_obj, "node_to_dma32_mem", node_to_dma32_mem_obj);
+    Py_DECREF(node_to_dma32_mem_obj);
  
     return ret_obj;
 #undef MAX_CPU_ID
diff --git a/tools/python/xen/xend/XendConfig.py b/tools/python/xen/xend/XendConfig.py
index 6a168a264d..0eadf343d3 100644
--- a/tools/python/xen/xend/XendConfig.py
+++ b/tools/python/xen/xend/XendConfig.py
@@ -2111,6 +2111,13 @@ class XendConfig(dict):
     def is_hap(self):
         return self['platform'].get('hap', 0)
 
+    def is_pv_and_has_pci(self):
+        """True if a 'pci' device is configured and is_hvm() is false."""
+        for kind, _unused in self.all_devices_sxpr():
+            if kind == 'pci':
+                return not self.is_hvm()
+        return False
+
     def update_platform_pci(self):
         pci = []
         for dev_type, dev_info in self.all_devices_sxpr():
diff --git a/tools/python/xen/xend/XendDomainInfo.py b/tools/python/xen/xend/XendDomainInfo.py
index 212d1d3927..f6becb6bcc 100644
--- a/tools/python/xen/xend/XendDomainInfo.py
+++ b/tools/python/xen/xend/XendDomainInfo.py
@@ -2580,7 +2580,8 @@ class XendDomainInfo:
 
 
     def _setCPUAffinity(self):
-        """ Repin domain vcpus if a restricted cpus list is provided
+        """ Repin domain vcpus if a restricted cpus list is provided.
+            Returns the chosen node number.
         """
 
         def has_cpus():
@@ -2597,6 +2598,7 @@ class XendDomainInfo:
                         return True
             return False
 
+        index = 0
         if has_cpumap():
             for v in range(0, self.info['VCPUs_max']):
                 if self.info['vcpus_params'].has_key('cpumap%i' % v):
@@ -2647,6 +2649,54 @@ class XendDomainInfo:
                 cpumask = info['node_to_cpu'][index]
                 for v in range(0, self.info['VCPUs_max']):
                     xc.vcpu_setaffinity(self.domid, v, cpumask)
+        return index
+
+    def _freeDMAmemory(self, node):
+        """Balloon down until `node` has enough free DMA32 memory.
+
+        A PV guest with PCI devices turns on a SWIOTLB, which _MUST_ be
+        located in the DMA32 zone (under 4GB).  Call balloon.free() with
+        growing requests until 64MB (plus 2MB of slack) is free there; this
+        might take out more memory than just 64MB.  Best effort: failures
+        are logged, never raised.  No-op unless the guest is PV with PCI.
+        """
+        if not self.info.is_pv_and_has_pci():
+            return
+
+        retries = 2000
+        ask_for_mem = 0
+        need_mem = 0
+        try:
+            while retries > 0:
+                physinfo = xc.physinfo()
+                # Valid node numbers are 0 .. nr_nodes - 1.
+                if node >= physinfo['nr_nodes']:
+                    return
+                # Amounts are in KiB; the extra 2MB above 64MB seems to do
+                # the trick.
+                need_mem = (64 * 1024 + 2048 -
+                            physinfo['node_to_dma32_mem'][node])
+                if need_mem <= 0:
+                    # OK. We got enough DMA memory.
+                    break
+                # Start by asking for just the shortfall; every retry asks
+                # for 2MB more than the previous request.
+                ask_for_mem = max(need_mem, ask_for_mem)
+                log.debug('_freeDMAmemory (%d) Need %dKiB DMA memory. '
+                          'Asking for %dKiB', retries, need_mem,
+                          ask_for_mem)
+                balloon.free(ask_for_mem, self)
+                ask_for_mem = ask_for_mem + 2048
+                retries = retries - 1
+        except Exception:
+            # This is best-try after all, but do not hide the reason.
+            log.exception('_freeDMAmemory: freeing DMA memory failed')
+            need_mem = max(1, need_mem)
+
+        if need_mem > 0:
+            log.warn('We tried our best to balloon down DMA memory to '
+                     'accomodate your PV guest. We need %dKiB extra memory.',
+                     need_mem)
 
     def _setSchedParams(self):
         if XendNode.instance().xenschedinfo() == 'credit':
@@ -2668,7 +2718,7 @@ class XendDomainInfo:
             # repin domain vcpus if a restricted cpus list is provided
             # this is done prior to memory allocation to aide in memory
             # distribution for NUMA systems.
-            self._setCPUAffinity()
+            node = self._setCPUAffinity()
 
             # Set scheduling parameters.
             self._setSchedParams()
@@ -2730,6 +2780,8 @@ class XendDomainInfo:
             if self.info.target():
                 self._setTarget(self.info.target())
 
+            self._freeDMAmemory(node)
+
             self._createDevices()
 
             self.image.cleanupTmpImages()
diff --git a/tools/python/xen/xend/XendNode.py b/tools/python/xen/xend/XendNode.py
index 0fbefef6f8..bb1dad4eab 100644
--- a/tools/python/xen/xend/XendNode.py
+++ b/tools/python/xen/xend/XendNode.py
@@ -872,11 +872,11 @@ class XendNode:
         except:
             str='none\n'
         return str[:-1];
-    def format_node_to_memory(self, pinfo):
+    def format_node_to_memory(self, pinfo, key):
         str=''
         whitespace=''
         try:
-            node_to_memory=pinfo['node_to_memory']
+            node_to_memory=pinfo[key]
             for i in range(0, pinfo['nr_nodes']):
                 str+='%snode%d:%d\n' % (whitespace,
                                         i,
@@ -896,7 +896,10 @@ class XendNode:
         info['total_memory'] = info['total_memory'] / 1024
         info['free_memory']  = info['free_memory'] / 1024
         info['node_to_cpu']  = self.format_node_to_cpu(info)
-        info['node_to_memory'] = self.format_node_to_memory(info)
+        info['node_to_memory'] = self.format_node_to_memory(info,
+					'node_to_memory')
+        info['node_to_dma32_mem'] = self.format_node_to_memory(info,
+					'node_to_dma32_mem')
 
         ITEM_ORDER = ['nr_cpus',
                       'nr_nodes',
@@ -908,7 +911,8 @@ class XendNode:
                       'total_memory',
                       'free_memory',
                       'node_to_cpu',
-                      'node_to_memory'
+                      'node_to_memory',
+                      'node_to_dma32_mem'
                       ]
 
         return [[k, info[k]] for k in ITEM_ORDER]
-- 
2.30.2